On cherche à étudier comparer des méthode d’inférence de réseaux (co-expression et régulation).
## corrplot 0.84 loaded
load(paste0("./GenesCO2_", specie, ".RData"))
load("./normalized.count_At.RData")
# quantification file
data <- read.csv("quantifFiles/QuantifGenes.csv", h = T, sep = ",")
rownames(data) <- data$Gene
genes = which(!(grepl("__", rownames(data))))
not_quant = data[which((grepl("__", rownames(data)))), ]
data = data[genes, grepl("R", colnames(data))]
keep <- rowSums(data) >= 10
data <- data[keep, ]
group <- sapply(colnames(data), getLabel, with.rep = F)
colnames(data) <- sapply(colnames(data), getLabel)
specie = "At"
clusteredGenes <- clustering(sharedBy3, data)****************************************
coseq analysis: Poisson approach & none transformation
K = 2 to 12
Use set.seed() prior to running coseq for reproducible results.
****************************************
Running g = 2 ...
[1] "Initialization: 1"
[1] "Log-like diff: 0"
Running g = 3 ...
[1] "Initialization: 1"
[1] "Log-like diff: 4.90653064844082e-08"
Running g = 4 ...
[1] "Initialization: 1"
[1] "Log-like diff: 9.85664883046411e-11"
Running g = 5 ...
[1] "Initialization: 1"
[1] "Log-like diff: 3.65335921514998e-07"
Running g = 6 ...
[1] "Initialization: 1"
[1] "Log-like diff: 0"
Running g = 7 ...
[1] "Initialization: 1"
[1] "Log-like diff: 199.744202742663"
[1] "Log-like diff: 158.733444037593"
[1] "Log-like diff: 61.6211297058271"
[1] "Log-like diff: 95.47019752706"
[1] "Log-like diff: 78.9880838071986"
Running g = 8 ...
[1] "Initialization: 1"
[1] "Log-like diff: 9.35582988859096e-09"
Running g = 9 ...
[1] "Initialization: 1"
[1] "Log-like diff: 2.55795384873636e-13"
Running g = 10 ...
[1] "Initialization: 1"
[1] "Log-like diff: 1.28659394249553e-09"
Running g = 11 ...
[1] "Initialization: 1"
[1] "Log-like diff: 2.8421709430404e-14"
Running g = 12 ...
[1] "Initialization: 1"
[1] "Log-like diff: 2.55795384873636e-13"
$ICL
$profiles
$boxplots
$probapost_barplots
*************************************************
Model: Poisson
Transformation: none
*************************************************
Clusters fit: 2,3,4,5,6,7,8,9,10,11,12
Clusters with errors: ---
Selected number of clusters via ICL: 12
ICL of selected model: -740101.6
*************************************************
Number of clusters = 12
ICL = -740101.6
*************************************************
Cluster sizes:
Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6 Cluster 7
12 11 22 9 6 10 3
Cluster 8 Cluster 9 Cluster 10 Cluster 11 Cluster 12
2 13 18 23 2
Number of observations with MAP > 0.90 (% of total):
131 (100%)
Number of observations with MAP > 0.90 per cluster (% of total per cluster):
Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5 Cluster 6 Cluster 7
12 11 22 9 6 10 3
(100%) (100%) (100%) (100%) (100%) (100%) (100%)
Cluster 8 Cluster 9 Cluster 10 Cluster 11 Cluster 12
2 13 18 23 2
(100%) (100%) (100%) (100%) (100%)
NULL
Model-Based Clustering Using MPLN (Parallelized) Description Performs clustering using mixtures of multivariate Poisson-log normal (MPLN) distribution and model selection using AIC, AIC3, BIC and ICL. Since each component/cluster size (G) is independent from another, all Gs in the range to be tested have been parallelized to run on a seperate core using the parallel R package.
Class: pca dudi
Call: dudi.pca(df = log(data + 0.1), center = TRUE, scale = TRUE, scannf = FALSE,
nf = 4)
Total inertia: 24
Eigenvalues:
Ax1 Ax2 Ax3 Ax4 Ax5
17.2861 4.7060 0.6320 0.5028 0.2615
Projected inertia (%):
Ax1 Ax2 Ax3 Ax4 Ax5
72.025 19.608 2.633 2.095 1.089
Cumulative projected inertia (%):
Ax1 Ax1:2 Ax1:3 Ax1:4 Ax1:5
72.03 91.63 94.27 96.36 97.45
(Only 5 dimensions (out of 24) are shown)
NULL
## PNLModels
groups <- str_split_fixed(colnames(data), "_", 2)[, 1]
co2 <- str_split_fixed(groups, "", 3)[, 1]
nitrate <- factor(str_split_fixed(groups, "", 3)[, 2])
nitrate <- relevel(nitrate, "N")
fer <- factor(str_split_fixed(groups, "", 3)[, 3])
fer = relevel(fer, "F")
covariates <- data.frame(row.names = colnames(data), co2, nitrate, fer)
DEGenes <- sharedBy3
# preparation des données
counts <- round(t(data[DEGenes, ]), 0)
plnData <- prepare_data(counts = counts, covariates = covariates)
PCA_models <- PLNPCA(Abundance ~ 1 + nitrate + fer + co2 + offset(log(Offset)), data = plnData,
ranks = 1:10)
Initialization...
Adjusting 10 PLN models for PCA analysis.
Rank approximation = 1
Rank approximation = 2
Rank approximation = 3
Rank approximation = 4
Rank approximation = 5
Rank approximation = 6
Rank approximation = 7
Rank approximation = 8
Rank approximation = 9
Rank approximation = 10
Post-treatments
DONE!
--------------------------------------------------------
COLLECTION OF 10 POISSON LOGNORMAL MODELS
--------------------------------------------------------
Task: Principal Component Analysis
========================================================
- Ranks considered: from 1 to 10
- Best model (greater BIC): rank = 10 - R2 = 0.97
- Best model (greater ICL): rank = 10 - R2 = 0.97
myPCA_ICL <- getBestModel(PCA_models, "ICL")
plot(myPCA_ICL, ind_cols = groups, var_cols = factor(clusteredGenes[rownames(data[DEGenes,
])]))gridExtra::grid.arrange(plot(myPCA_ICL, ind_cols = groups, map = "individual", plot = FALSE),
plot(myPCA_ICL, var_cols = factor(clusteredGenes[rownames(data[DEGenes, ])]),
map = "variable", plot = FALSE), ncol = 2)TF <- read.table("TFs_PlnTFDB.txt", h = T, sep = "\t")
TF$AGI <- str_split_fixed(TF$Protein.ID, "\\.", 2)[, 1]
net_norm <- PLN_network(data = data, DEGenes = sharedBy3)
Initialization...
Adjusting 30 PLN with sparse inverse covariance estimation
Joint optimization alternating gradient descent and graphical-lasso
sparsifying penalty = 1.673463
sparsifying penalty = 1.545729
sparsifying penalty = 1.427745
sparsifying penalty = 1.318766
sparsifying penalty = 1.218106
sparsifying penalty = 1.125129
sparsifying penalty = 1.039249
sparsifying penalty = 0.9599238
sparsifying penalty = 0.8866536
sparsifying penalty = 0.8189761
sparsifying penalty = 0.7564644
sparsifying penalty = 0.6987241
sparsifying penalty = 0.6453911
sparsifying penalty = 0.5961289
sparsifying penalty = 0.5506269
sparsifying penalty = 0.508598
sparsifying penalty = 0.4697772
sparsifying penalty = 0.4339195
sparsifying penalty = 0.4007988
sparsifying penalty = 0.3702062
sparsifying penalty = 0.3419487
sparsifying penalty = 0.315848
sparsifying penalty = 0.2917396
sparsifying penalty = 0.2694714
sparsifying penalty = 0.2489028
sparsifying penalty = 0.2299043
sparsifying penalty = 0.2123559
sparsifying penalty = 0.196147
sparsifying penalty = 0.1811752
sparsifying penalty = 0.1673463
Post-treatments
DONE!
Stability Selection for PLNnetwork:
subsampling: ++++++++++++++++++++Poisson Lognormal with sparse inverse covariance (penalty = 0.167)
==================================================================
nb_param loglik BIC ICL R_squared n_edges EBIC pen_loglik
955 -21692.39 -23209.91 -17590.32 1 431 -23852.85 -21788.51
density
0.05
==================================================================
* Useful fields
$model_par, $latent, $var_par, $optim_par
$loglik, $BIC, $ICL, $loglik_vec, $nb_param, $criteria
* Useful S3 methods
print(), coef(), sigma(), vcov(), fitted(), predict(), standard_error()
* Additional fields for sparse network
$EBIC, $density, $penalty
* Additional S3 methods for network
plot.PLNnetworkfit()
V(net_norm)$color <- clusteredGenes[V(net_norm)]
plot.igraph(net_norm, vertex.size = 10, vertex.label.cex = 0.5, edge.width = 0.5)[1] 431
data <- toVisNetworkData(net_norm)
data$nodes$size = 10
data$nodes$font.size = 10
visNetwork(nodes = data$nodes, edges = data$edges, height = "500px", width = "100%") %>%
visEdges(smooth = FALSE) %>% visPhysics(solver = "forceAtlas2Based", timestep = 1,
minVelocity = 10, maxVelocity = 10, stabilization = F) %>% visOptions(selectedBy = "group",
highlightNearest = TRUE, nodesIdSelection = TRUE)data$nodes$group <- ifelse(data$nodes$id %in% TF$AGI, 1, 0)
visNetwork(nodes = data$nodes, edges = data$edges, height = "500px", width = "100%") %>%
visEdges(smooth = FALSE) %>% visPhysics(solver = "forceAtlas2Based", timestep = 1,
minVelocity = 10, maxVelocity = 10, stabilization = F) %>% visOptions(selectedBy = "group",
highlightNearest = TRUE, nodesIdSelection = TRUE)Method : The identification of the regulatory genes for a given target gene is defined as determining the subset of genes whose expression directly influences or is predictive of the expression of the target gene. Within the framework of supervised learning, this problem is equivalent to a feature selection problem. In this context, our solution will exploit the embedded feature ranking mechanism of tree-based ensemble methods.
Feature importance : We consider a measure which at each test node computes the total reduction of the variance of the output variable due to the split, and then average it for all trees. (equivament to oob, but less computationnally expensive)
Thresholding : A practical network prediction is then obtained by setting a threshold on this ranking. In this paper, we focus only on the first task […]. The question of the choice of an optimal confidence threshold, although important, will be left open.
Normalisation : To avoid biases, we have first normalized the gene expressions so that they all have a unit variance in the training set, before applying the tree-based ensemble methods. This normalization indeed implies that the different weights inferred from different models predicting the different gene expressions are comparable.
https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0012776
library(GENIE3)
for (thr in seq(0.05, 0.39, by = 0.05)) {
print(thr)
genie(normalized.count, regressors = intersect(TF$AGI, sharedBy3), targets = sharedBy3,
thr = thr)
}[1] 0.05
[1] 0.1
[1] 0.15
[1] 0.2
[1] 0.25
[1] 0.3
[1] 0.35
net <- genie(normalized.count, regressors = intersect(TF$AGI, sharedBy3), targets = sharedBy3,
thr = 0.25)data <- toVisNetworkData(net)
ont <- OntologyProfile(V(net)$name, plot = F)
data$nodes$Ontology <- ont[match(ont$ensembl_gene_id, data$nodes$id), ]$external_gene_name
data$nodes$description <- ont[match(ont$ensembl_gene_id, data$nodes$id), ]$description
visNetwork(nodes = data$nodes, edges = data$edges, height = "500px", width = "100%") %>%
visEdges(smooth = FALSE) %>% visPhysics(solver = "forceAtlas2Based", timestep = 1,
minVelocity = 10, maxVelocity = 10, stabilization = F) %>% visOptions(selectedBy = c("Ontology"),
highlightNearest = TRUE, nodesIdSelection = TRUE)V(net)$color <- clusteredGenes[V(net)]
data <- toVisNetworkData(net)
visNetwork(nodes = data$nodes, edges = data$edges, height = "500px", width = "100%") %>%
visEdges(smooth = FALSE) %>% visPhysics(solver = "forceAtlas2Based", timestep = 1,
minVelocity = 10, maxVelocity = 10, stabilization = F) %>% visOptions(selectedBy = "group",
highlightNearest = TRUE, nodesIdSelection = TRUE, collapse = TRUE)